Source Code of org.terrier.structures.indexing.singlepass.hadoop.CollectionRecordReader

/*
 * Terrier - Terabyte Retriever
 * Webpage: http://terrier.org
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.uk
 *
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is CollectionRecordReader.java.
 *
 * The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
 *   Richard McCreadie <richardm{a.}dcs.gla.ac.uk> 
 *   
 */


package org.terrier.structures.indexing.singlepass.hadoop;


import java.io.IOException;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;


import org.terrier.indexing.Collection;
import org.terrier.indexing.Document;
import org.terrier.utility.io.HadoopUtility;
import org.terrier.utility.io.WrappedIOException;


/** An abstract RecordReader class which provides methods to read a collection
 * within the Hadoop framework. Note that the collection will be split based on
 * a predetermined InputSplit type which must contain positional information, i.e.
 * which split it is in the list of all splits.
 * @author Craig Madonald and Richard McCreadie
  * @param <SPLITTYPE> The subclass of InputSplit that this class should work with
 */
@SuppressWarnings("deprecation")
public abstract class CollectionRecordReader<SPLITTYPE extends PositionAwareSplit<?>>
  implements RecordReader<Text, SplitAwareWrapper<Document>>
{
  /** document collection currently being iterated through. starts as null */
  protected Collection documentCollection = null;
  /** the files in this split */
  protected SPLITTYPE split;
  /** the configuration of this job */
  protected Configuration config;
  
  /** the number of documents extacted thus far */
  protected int currentDocument;


  /** number of collections obtained thus far by this record reader */
  protected int collectionIndex;
  /** 
   * constructor
   * @param _jobConf
   * @param _split
   * @throws IOException
   */
  public CollectionRecordReader(JobConf _jobConf, SPLITTYPE _split) throws IOException {
    this.config = _jobConf;
    this.split = _split;
    this.collectionIndex = 0;
    this.currentDocument = 0;
    try{
      HadoopUtility.loadTerrierJob(_jobConf);
    } catch (Exception e) {
      throw new WrappedIOException("Cannot load ApplicationSetup", e);
    }
  }


  /**
   * Closes the document collection if it exists
   */
  public void close() throws IOException {
    closeCollectionSplit();
  }


  /** Create a new Key, each key
   * is a Document Number
   */
  public Text createKey() {
    return new Text();
  }


  /** Create a new Text value,
   * each value is a document
   */
  public SplitAwareWrapper<Document> createValue() {
    return new SplitAwareWrapper<Document>(split.getSplitIndex());
  }
  
  /**
   * Returns the number of bits the recordreader has
   * accessed, thereby giving the position in
   * the input data.
   */
  public abstract long getPos() throws IOException;


  /**
   * Returns the progress of the reading
   */
  public abstract float getProgress() throws IOException;


  /** Moves to the next Document in the Collections accessing this InputSplit
   * if one exists, setting DocID to the property
   * "DOCID" and Document to the text within the
   * document. Returns true iff there was indeed a document to read,
   * and hence this document is now in the DocID and document arguments.
   */
  @Override
  public boolean next(Text DocID, SplitAwareWrapper<Document> document) throws IOException
  {   
    while(true)
    {
      if (documentCollection == null)
      {  
        documentCollection = openCollectionSplit(collectionIndex);
        if (documentCollection == null)//no files in this split
          return false;
      }
      assert documentCollection != null;
      while (documentCollection.nextDocument()) 
      {
        // Get Document
        Document tempDoc = documentCollection.getDocument();
        if (tempDoc != null)
        {
          // Retrieve Document's Unique ID
          if (tempDoc.getProperty("docno") == null)
          {
            throw new IOException("Collection returned null as docno");
          }
          else
          {
            DocID.set(tempDoc.getProperty("docno"));
          }
          document.setObject(tempDoc);
          currentDocument++;
          return true;
        }        
      }
      //no more documents in documentCollection
      closeCollectionSplit();
      collectionIndex++;
      documentCollection = null;
    }
  }


  /** open a collection for the index'th parth of the current split */
  protected abstract Collection openCollectionSplit(int index) throws IOException;
  
  /** closes the current collection */
  protected void closeCollectionSplit() throws IOException {
    if (documentCollection != null) 
      documentCollection.close();
  }


}
Source Code of org.terrier.structures.indexing.singlepass.hadoop.CollectionRecordReader

Related Classes of org.terrier.structures.indexing.singlepass.hadoop.CollectionRecordReader